I am following the tidy text format suggested in Text Mining with R: A Tidy Approach by Julia Silge and David Robinson (https://www.tidytextmining.com/).

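Here are the key ideas related to tidy text:

  * The tidy text format is described as a table with one-token-per-row.
  * A token is a meaningful unit of text, such as a word, that we are interested in using.
  * Tokenization is the process of splitting text into tokens.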

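Other data structures (in contrast to tidy text):

  * String
  * Corpus: Raw strings annotated with additional metadata and details
  * Document-term matrix: a sparse matrix describing a collection (i.e., a corpus) of documents with one row for each document and one column for each term. The value in the matrix is typically word count or tf-idf.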

Tokenization code example

Example code for tokenization

library(tidyverse)
library(tidytext)

# Four lines of verse used as the raw input
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)

# Display the character vector
text

# Wrap the verse in a tibble: one row per line of verse, numbered 1 to 4
text_df <- tibble(
  line = 1:4,
  text = text
)

# Display the tibble
text_df

# Tokenize: split the `text` column into one word per row, stored in a new
# `word` column
text_df %>%
  unnest_tokens(output = word, input = text)

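Observe the following about the unnest_tokens function:

  * Columns such as the line number are retained
  * Punctuation has been stripped
  * Words are all lower case (this can be controlled)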

We can remove stop words

# Load the stop_words data set into the workspace and display it
data(stop_words)
stop_words

# Tokenize the verse, then drop every row whose word appears in stop_words
text_df %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(y = stop_words, by = "word")

# Tokenize the verse, then tally each word, most frequent first
text_df %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE)

Preparation of the Reback text message library

Load the library

# The workspace is intentionally not cleared here. rm(list = ls()) deletes the
# objects of whoever runs this chunk and still does not give a clean session
# (attached packages and options survive); restart R for a fresh start instead.
library(readxl)

# Location of the text message workbook
# NOTE(review): this is an absolute, machine-specific path, so the chunk only
# runs where this exact file exists -- consider a project-relative path.
txtMsgLibraryPath <- "C:/Users/nlbfr/Dropbox/UNC/Causal NLP/Reback_TxtLibrary/Reback_Project Tech Support Text Message Library_NF.xlsx"

# Load the text message library
# Note that this library is modified to give the Pre and Post Messages unique identifiers
# The first 24 rows of the "Library" sheet are skipped and the columns are
# named msgID and textMsg.
txtMsgLibrary <- readxl::read_xlsx(
  path = txtMsgLibraryPath,
  sheet = "Library",
  skip = 24,
  col_names = c("msgID", "textMsg")
)

# Remove the blank lines
## Count the rows whose message text is missing (TRUE) versus present (FALSE)
table(is.na(txtMsgLibrary$textMsg))

FALSE  TRUE 
  660     9 
# Keep only the rows that actually contain message text
txtMsgLibrary <- filter(txtMsgLibrary, !is.na(textMsg))

Tidy up the text messages

library(tidyverse)
library(tidytext)

# Create tidy text (one word per row) and remove stop words
tidyTxt <- txtMsgLibrary %>%
  unnest_tokens(word, textMsg) %>%
  anti_join(stop_words, by = "word")

# Create the Document-term matrix
## A sparse matrix could be created with:
## txtDTMsparse <- tidyTxt %>% count(msgID, word) %>% cast_dtm(term = word, document = msgID, value = n)
## The Gibbs sampler was made for a non-sparse matrix, so build a dense table
## instead: one row per msgID, one column per word, cells hold word counts.
## NOTE: a message whose words are all stop words has no rows in tidyTxt and
## therefore gets no row in txtDTM.
txtDTM <- tidyTxt %>%
  # count(msgID, word) tallies per message without leaving the result grouped
  count(msgID, word) %>%
  pivot_wider(
    names_from = word,
    values_from = n,
    # words absent from a message get a count of 0 rather than NA
    values_fill = 0,
    # keep the word columns in sorted order, as spread() arranged them
    names_sort = TRUE
  )
txtDTM
LS0tDQp0aXRsZTogIkRhdGEgcHJlcGFyYXRpb24gZm9yIFJlYmFjayB0ZXh0IG1zZyBsaWJyYXJ5Ig0KYXV0aG9yOiAiTmlra2kgRnJlZW1hbiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCkkgYW0gZm9sbG93aW5nIHRoZSB0aWR5IHRleHQgZm9ybWF0IHN1Z2dlc3RlZCBpbiBfVGV4dCBNaW5pbmcgd2l0aCBSOiBBIFRpZHkgQXBwcm9hY2hfIGJ5IEp1bGlhIFNpbGdlIGFuZCBEYXZpZCBSb2JpbnNvbiAoaHR0cHM6Ly93d3cudGlkeXRleHRtaW5pbmcuY29tLykuDQoNCkhlcmUgYXJlIHRoZSBrZXkgaWRlYXMgcmVsYXRlZCB0byB0aWR5IHRleHQNCg0KICAqIFRoZSB0aWR5IHRleHQgZm9ybWF0IGlzIGRlc2NyaWJlZCBhcyAqYSB0YWJsZSB3aXRoIG9uZS10b2tlbi1wZXItcm93Ki4NCiAgKiBBICp0b2tlbiogaXMgYSBtZWFuaW5nZnVsIHVuaXQgb2YgdGV4dCwgc3VjaCBhcyBhIHdvcmQsIHRoYXQgd2UgYXJlIGludGVyZXN0ZWQgaW4gdXNpbmcuDQogICogKlRva2VuaXphdGlvbiogaXMgdGhlIHByb2Nlc3Mgb2Ygc3BsaXR0aW5nIHRleHQgaW50byB0b2tlbnMuDQogIA0KT3RoZXIgZGF0YSBzdHJ1Y3R1cmVzIChpbiBjb250cmFzdCB0byB0aWR5IHRleHQpDQoNCiAgKiBTdHJpbmcNCiAgKiBDb3JwdXM6IFJhdyBzdHJpbmdzIGFubm90YXRlZCB3aXRoIGFkZGl0aW9uYWwgbWV0YWRhdGEgYW5kIGRldGFpbHMNCiAgKiBEb2N1bWVudC10ZXJtIG1hdHJpeDogYSBzcGFyc2UgbWF0cml4IGRlc2NyaWJpbmcgYSBjb2xsZWN0aW9uIChpLmUuLCBhIGNvcnB1cykgb2YgZG9jdW1lbnRzIHdpdGggb25lIHJvdyBmb3IgZWFjaCBkb2N1bWVudCBhbmQgb25lIGNvbHVtbiBmb3IgZWFjaCB0ZXJtLiBUaGUgdmFsdWUgaW4gdGhlIG1hdHJpeCBpcyB0eXBpY2FsbHkgd29yZCBjb3VudCBvciB0Zi1pZGYuDQoNCiMjIFRva2VuaXphdGlvbiBjb2RlIGV4YW1wbGUNCkV4YW1wbGUgY29kZSBmb3IgdG9rZW5pemF0aW9uDQpgYGB7cn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeSh0aWR5dGV4dCkNCg0KIyBSYXcgdGV4dA0KdGV4dCA8LSBjKCJCZWNhdXNlIEkgY291bGQgbm90IHN0b3AgZm9yIERlYXRoIC0iLA0KICAgICAgICAgICJIZSBraW5kbHkgc3RvcHBlZCBmb3IgbWUgLSIsDQogICAgICAgICAgIlRoZSBDYXJyaWFnZSBoZWxkIGJ1dCBqdXN0IE91cnNlbHZlcyAtIiwNCiAgICAgICAgICAiYW5kIEltbW9ydGFsaXR5IikNCg0KdGV4dA0KDQojUHV0IHRoZSByYXcgdGV4dCBpbnRvIGEgZGF0YSBmcmFtZQ0KdGV4dF9kZiA8LSB0aWJibGUobGluZSA9IDE6NCwgdGV4dCA9IHRleHQpDQoNCnRleHRfZGYNCg0KIyBCcmVhayB1cCBpbnRvIHRva2Vucw0KdGV4dF9kZiAlPiUgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQ0KYGBgDQoNCk9ic2VydmUgdGhlIGZvbGxvd2luZyBhYm91dCB0aGUgYHVubmVzdF90b2tlbnNgIGZ1bmN0aW9uOg0KDQogICogQ29sdW1ucyBzdWNoIGFzIHRoZSBsaW5lIG51bWJlciBhcmUgcmV0YWluZWQNCiAg
KiBQdW5jdHVhdGlvbiBoYXMgYmVlbiBzdHJpcHBlZA0KICAqIFdvcmRzIGFyZSBhbGwgbG93ZXIgY2FzZSAodGhpcyBjYW4gYmUgY29udHJvbGxlZCkNCg0KV2UgY2FuIHJlbW92ZSBzdG9wIHdvcmRzDQpgYGB7cn0NCmRhdGEoc3RvcF93b3JkcykNCnN0b3Bfd29yZHMNCg0KIyBSZW1vdmUgdGhlIHN0b3Agd29yZHMNCnRleHRfZGYgJT4lIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lDQogIGFudGlfam9pbihzdG9wX3dvcmRzLCBieSA9ICJ3b3JkIikNCg0KIyBDb3VudCB0aGUgd29yZCBmcmVxdWVuY2llcw0KdGV4dF9kZiAlPiUgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KSAlPiUNCiAgY291bnQod29yZCwgc29ydCA9IFRSVUUpDQpgYGANCg0KIyMgUHJlcGFyYXRpb24gb2YgdGhlIFJlYmFjayB0ZXh0IG1lc3NhZ2UgbGlicmFyeQ0KDQpMb2FkIHRoZSBsaWJyYXJ5DQpgYGB7cn0NCnJtKGxpc3QgPSBscygpKQ0KbGlicmFyeShyZWFkeGwpDQoNCiMgTG9hZCB0aGUgdGV4dCBtZXNzYWdlIGxpYnJhcnkNCiMgTm90ZSB0aGF0IHRoaXMgbGlicmFyeSBpcyBtb2RpZmllZCB0byBnaXZlIHRoZSBQcmUgYW5kIFBvc3QgTWVzc2FnZXMgdW5pcXVlIGlkZW50aWZpZXJzDQp0eHRNc2dMaWJyYXJ5IDwtIHJlYWR4bDo6cmVhZF94bHN4KCJDOi9Vc2Vycy9ubGJmci9Ecm9wYm94L1VOQy9DYXVzYWwgTkxQL1JlYmFja19UeHRMaWJyYXJ5L1JlYmFja19Qcm9qZWN0IFRlY2ggU3VwcG9ydCBUZXh0IE1lc3NhZ2UgTGlicmFyeV9ORi54bHN4IiwgDQogICAgICAgICAgICAgICAgICBzaGVldCA9ICJMaWJyYXJ5Iiwgc2tpcCA9IDI0LCBjb2xfbmFtZXMgPSBjKCJtc2dJRCIsICJ0ZXh0TXNnIikpDQoNCiMNCg0KIyBSZW1vdmUgdGhlIGJsYW5rIGxpbmVzDQojIyBOdW1iZXIgb2YgbGluZXMgZXhwZWN0ZWQgdG8gYmUgYmxhbmtzDQp0YWJsZShpcy5uYSh0eHRNc2dMaWJyYXJ5JHRleHRNc2cpKQ0KIyBGaWx0ZXIgb3V0IHRoZSBibGFuayBsaW5lcw0KdHh0TXNnTGlicmFyeSA8LSB0eHRNc2dMaWJyYXJ5ICU+JSBmaWx0ZXIoIWlzLm5hKHRleHRNc2cpKQ0KIyBTYW5pdHkgY2hlY2sNCmRpbSh0eHRNc2dMaWJyYXJ5KQ0KYGBgDQoNCiMjIFRpZHkgdXAgdGhlIHRleHQgbWVzc2FnZXMNCmBgYHtyfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KDQojIENyZWF0ZSB0aWR5IHRleHQgYW5kIHJlbW92ZSBzdG9wIHdvcmRzDQp0aWR5VHh0IDwtIHR4dE1zZ0xpYnJhcnkgJT4lIHVubmVzdF90b2tlbnMod29yZCwgdGV4dE1zZykgJT4lDQogIGFudGlfam9pbihzdG9wX3dvcmRzLCBieSA9ICJ3b3JkIikNCg0KIyBDcmVhdGUgdGhlIERvY3VtZW50LXRlcm0gbWF0cml4DQojIyBDcmVhdGVzIGEgc3BhcnNlIG1hdHJpeA0KIyMgdHh0RFRNc3BhcnNlIDwtIHRpZHlUeHQgJT4lIGNvdW50KG1zZ0lELCB3b3JkKSAlPiUgY2FzdF9kdG0odGVybSA9IHdvcmQsIGRvY3VtZW50ID0gbXNnSUQsIHZhbHVlID0gbikNCiMjIFRoZSBHaWJicyBz
YW1wbGVyIHdhcyBtYWRlIGZvciBhIG5vbi1zcGFyc2UgbWF0cml4DQp0eHREVE0gPC0gdGlkeVR4dCAlPiUNCiAgZ3JvdXBfYnkobXNnSUQpICU+JQ0KICBjb3VudCh3b3JkKSAlPiUNCiAgc3ByZWFkKHdvcmQsIG4pDQp0eHREVE1baXMubmEodHh0RFRNKV0gPC0wDQp0eHREVE0NCg0KIyBTYXZlIHRoZSBtYXRyaXggZm9yIHVzZSB3aXRoIHRoZSBzYW1wbGVyDQp3cml0ZV9jc3YodHh0RFRNLCBwYXRoID0gIkM6L1VzZXJzL25sYmZyL0Ryb3Bib3gvVU5DL0NhdXNhbCBOTFAvUmViYWNrX1R4dExpYnJhcnkvUmViYWNrRFRNLmNzdiIpDQpgYGANCg==